{
  "cells": [
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "cHgpJN1uzD8B"
      },
      "source": [
        "# Tutorial on pre-training of Chinese-LLaMA-7B\n",
        "\n",
        "More info: https://github.com/ymcui/Chinese-LLaMA-Alpaca"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "2meQHBlHxcsi"
      },
      "source": [
        "## Install Dependencies"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "quRXOPaZwmwz",
        "outputId": "6d9febc4-b0d2-41ee-ce66-9284758e928e"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting transformers==4.28.1\n",
            "  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m83.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (3.12.0)\n",
            "Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.1)\n",
            "  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.5/224.5 kB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (1.22.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (23.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (6.0)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (2022.10.31)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (2.27.1)\n",
            "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)\n",
            "  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m105.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.28.1) (4.65.0)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers==4.28.1) (2023.4.0)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers==4.28.1) (4.5.0)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.28.1) (1.26.15)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.28.1) (2022.12.7)\n",
            "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.28.1) (2.0.12)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.28.1) (3.4)\n",
            "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
            "Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting git+https://github.com/huggingface/peft.git@13e53fc\n",
            "  Cloning https://github.com/huggingface/peft.git (to revision 13e53fc) to /tmp/pip-req-build-6fdyuocz\n",
            "  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-6fdyuocz\n",
            "\u001b[33m  WARNING: Did not find branch or tag '13e53fc', assuming revision or ref.\u001b[0m\u001b[33m\n",
            "\u001b[0m  Running command git checkout -q 13e53fc\n",
            "  Resolved https://github.com/huggingface/peft.git to commit 13e53fc\n",
            "  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (1.22.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (23.1)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (5.9.5)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (6.0)\n",
            "Requirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (2.0.0+cu118)\n",
            "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.3.0.dev0) (4.28.1)\n",
            "Collecting accelerate (from peft==0.3.0.dev0)\n",
            "  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m219.1/219.1 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.12.0)\n",
            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (4.5.0)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (1.11.1)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (3.1.2)\n",
            "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.3.0.dev0) (2.0.0)\n",
            "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (3.25.2)\n",
            "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.3.0.dev0) (16.0.3)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.14.1)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2022.10.31)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (2.27.1)\n",
            "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (0.13.3)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.3.0.dev0) (4.65.0)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers->peft==0.3.0.dev0) (2023.4.0)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.3.0.dev0) (2.1.2)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (1.26.15)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2022.12.7)\n",
            "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (2.0.12)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.3.0.dev0) (3.4)\n",
            "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.3.0.dev0) (1.3.0)\n",
            "Building wheels for collected packages: peft\n",
            "  Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for peft: filename=peft-0.3.0.dev0-py3-none-any.whl size=40652 sha256=45f5ed4c68d8fd79aafc4e46cab06b9f7825ddaf6ffbd65c3b9a48aff09aef7c\n",
            "  Stored in directory: /tmp/pip-ephem-wheel-cache-2yuu1wzt/wheels/d9/13/c6/404d5f8a81c5620f65f7fd75b6a66619f013cd79c2875b981c\n",
            "Successfully built peft\n",
            "Installing collected packages: accelerate, peft\n",
            "Successfully installed accelerate-0.19.0 peft-0.3.0.dev0\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting datasets\n",
            "  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m474.6/474.6 kB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.22.4)\n",
            "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n",
            "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n",
            "  Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n",
            "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.27.1)\n",
            "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.65.0)\n",
            "Collecting xxhash (from datasets)\n",
            "  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting multiprocess (from datasets)\n",
            "  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.4.0)\n",
            "Collecting aiohttp (from datasets)\n",
            "  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m71.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: huggingface-hub<1.0.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.14.1)\n",
            "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.1)\n",
            "Collecting responses<0.19 (from datasets)\n",
            "  Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n",
            "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n",
            "Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)\n",
            "  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->datasets)\n",
            "  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
            "Collecting yarl<2.0,>=1.0 (from aiohttp->datasets)\n",
            "  Downloading yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m35.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting frozenlist>=1.1.1 (from aiohttp->datasets)\n",
            "  Downloading frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (149 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m24.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting aiosignal>=1.1.2 (from aiohttp->datasets)\n",
            "  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (3.12.0)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (4.5.0)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (1.26.15)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.4)\n",
            "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
            "Installing collected packages: xxhash, multidict, frozenlist, dill, async-timeout, yarl, responses, multiprocess, aiosignal, aiohttp, datasets\n",
            "Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.12.0 dill-0.3.6 frozenlist-1.3.3 multidict-6.0.4 multiprocess-0.70.14 responses-0.18.0 xxhash-3.2.0 yarl-1.9.2\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting sentencepiece\n",
            "  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: sentencepiece\n",
            "Successfully installed sentencepiece-0.1.99\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting deepspeed\n",
            "  Downloading deepspeed-0.9.2.tar.gz (779 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m779.3/779.3 kB\u001b[0m \u001b[31m28.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Collecting hjson (from deepspeed)\n",
            "  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting ninja (from deepspeed)\n",
            "  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m146.0/146.0 kB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from deepspeed) (1.22.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from deepspeed) (23.1)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from deepspeed) (5.9.5)\n",
            "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from deepspeed) (9.0.0)\n",
            "Requirement already satisfied: pydantic<2.0.0 in /usr/local/lib/python3.10/dist-packages (from deepspeed) (1.10.7)\n",
            "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from deepspeed) (2.0.0+cu118)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from deepspeed) (4.65.0)\n",
            "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<2.0.0->deepspeed) (4.5.0)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->deepspeed) (3.12.0)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->deepspeed) (1.11.1)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->deepspeed) (3.1)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->deepspeed) (3.1.2)\n",
            "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch->deepspeed) (2.0.0)\n",
            "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch->deepspeed) (3.25.2)\n",
            "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch->deepspeed) (16.0.3)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->deepspeed) (2.1.2)\n",
            "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->deepspeed) (1.3.0)\n",
            "Building wheels for collected packages: deepspeed\n",
            "  Building wheel for deepspeed (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for deepspeed: filename=deepspeed-0.9.2-py3-none-any.whl size=811216 sha256=493b7db5f75d0669b1e656a7ae283c973e2e11ee59959cbd7f71ceb648bdfa27\n",
            "  Stored in directory: /root/.cache/pip/wheels/a6/d2/b1/b15210b5dc024bab4eccbac2148db29959fe01fe6042557d07\n",
            "Successfully built deepspeed\n",
            "Installing collected packages: ninja, hjson, deepspeed\n",
            "Successfully installed deepspeed-0.9.2 hjson-3.1.0 ninja-1.11.1\n"
          ]
        }
      ],
      "source": [
        "!pip install transformers==4.28.1\n",
        "!pip install git+https://github.com/huggingface/peft.git@13e53fc\n",
        "!pip install datasets\n",
        "!pip install sentencepiece\n",
        "!pip install deepspeed"
      ]
    },
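    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Before continuing, you can optionally verify that the pinned versions are the ones that ended up active (the expected versions come from the install log above):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Optional sanity check: the listed versions should match the install log\n",
        "# (transformers 4.28.1, peft 0.3.0.dev0, datasets 2.12.0,\n",
        "#  sentencepiece 0.1.99, deepspeed 0.9.2).\n",
        "!pip list | grep -E '^(transformers|peft|datasets|sentencepiece|deepspeed) '"
      ]
    },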
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "ji21WFqexASI"
      },
      "source": [
        "## Clone our repository\n",
        "\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dulrlPMexFNN",
        "outputId": "b9168c88-e72e-4f18-9450-d28832f7fe58"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Cloning into 'Chinese-LLaMA-Alpaca'...\n",
            "remote: Enumerating objects: 911, done.\u001b[K\n",
            "remote: Counting objects: 100% (352/352), done.\u001b[K\n",
            "remote: Compressing objects: 100% (233/233), done.\u001b[K\n",
            "remote: Total 911 (delta 135), reused 211 (delta 117), pack-reused 559\u001b[K\n",
            "Receiving objects: 100% (911/911), 18.13 MiB | 10.57 MiB/s, done.\n",
            "Resolving deltas: 100% (527/527), done.\n"
          ]
        }
      ],
      "source": [
        "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca.git"
      ]
    },
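    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Everything used below comes from this clone: the sample corpus lives under data/ and the pre-training code under scripts/. Listing the top-level layout confirms the checkout:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Inspect the repository layout: data/ holds pt_sample_data.txt and\n",
        "# scripts/ holds the pre-training script used later in this notebook.\n",
        "!ls Chinese-LLaMA-Alpaca"
      ]
    },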
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "pqH_h_ZAz4_e"
      },
      "source": [
        "## Pre-training for LLaMA-7B\n",
        "\n",
        "This follows the setting in https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Pretraining-Script, except that to simplify the tutorial,\n",
        "- only train 100 steps\n",
        "- use a sample data file built from alpaca_data_zh_51k.json"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "!mkdir Chinese-LLaMA-Alpaca/pt_data\n",
        "!cp Chinese-LLaMA-Alpaca/data/pt_sample_data.txt Chinese-LLaMA-Alpaca/pt_data"
      ]
    },
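    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The next cell launches the run. For orientation, a minimal sketch of the launch command is shown below, assuming the script name (run_clm_pt_with_peft.py) and DeepSpeed config (ds_zero2_no_offload.json) from the wiki page above; the model, tokenizer, block size, and step count match the run that follows, and every path and hyperparameter should be treated as a placeholder for your own setup.\n",
        "\n",
        "```bash\n",
        "cd Chinese-LLaMA-Alpaca/scripts\n",
        "torchrun --nnodes 1 --nproc_per_node 1 run_clm_pt_with_peft.py \\\n",
        "    --deepspeed ds_zero2_no_offload.json \\\n",
        "    --model_name_or_path decapoda-research/llama-7b-hf \\\n",
        "    --tokenizer_name_or_path ziqingyang/chinese-llama-lora-7b \\\n",
        "    --dataset_dir ../pt_data \\\n",
        "    --data_cache_dir data_cache \\\n",
        "    --block_size 512 \\\n",
        "    --max_steps 100 \\\n",
        "    --fp16 \\\n",
        "    --output_dir output_dir\n",
        "```"
      ]
    },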
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "IhrAVNUKSw9_",
        "outputId": "4f240f2b-2396-405b-9a94-84446db8d3e6"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "2023-05-12 06:15:17.883035: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
            "[2023-05-12 06:15:18,834] [INFO] [comm.py:622:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
            "05/12/2023 06:15:21 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True\n",
            "[INFO|configuration_utils.py:668] 2023-05-12 06:15:21,697 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/config.json\n",
            "[INFO|configuration_utils.py:720] 2023-05-12 06:15:21,698 >> Model config LlamaConfig {\n",
            "  \"_name_or_path\": \"decapoda-research/llama-7b-hf\",\n",
            "  \"architectures\": [\n",
            "    \"LLaMAForCausalLM\"\n",
            "  ],\n",
            "  \"bos_token_id\": 0,\n",
            "  \"eos_token_id\": 1,\n",
            "  \"hidden_act\": \"silu\",\n",
            "  \"hidden_size\": 4096,\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"intermediate_size\": 11008,\n",
            "  \"max_position_embeddings\": 2048,\n",
            "  \"max_sequence_length\": 2048,\n",
            "  \"model_type\": \"llama\",\n",
            "  \"num_attention_heads\": 32,\n",
            "  \"num_hidden_layers\": 32,\n",
            "  \"pad_token_id\": -1,\n",
            "  \"rms_norm_eps\": 1e-06,\n",
            "  \"tie_word_embeddings\": false,\n",
            "  \"torch_dtype\": \"float16\",\n",
            "  \"transformers_version\": \"4.28.1\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 32000\n",
            "}\n",
            "\n",
            "[INFO|tokenization_utils_base.py:1809] 2023-05-12 06:15:21,947 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--ziqingyang--chinese-llama-lora-7b/snapshots/b5e520ae0a1282c6105a72ad6063a3b3de211067/tokenizer.model\n",
            "[INFO|tokenization_utils_base.py:1809] 2023-05-12 06:15:21,947 >> loading file added_tokens.json from cache at None\n",
            "[INFO|tokenization_utils_base.py:1809] 2023-05-12 06:15:21,947 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--ziqingyang--chinese-llama-lora-7b/snapshots/b5e520ae0a1282c6105a72ad6063a3b3de211067/special_tokens_map.json\n",
            "[INFO|tokenization_utils_base.py:1809] 2023-05-12 06:15:21,947 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--ziqingyang--chinese-llama-lora-7b/snapshots/b5e520ae0a1282c6105a72ad6063a3b3de211067/tokenizer_config.json\n",
            "05/12/2023 06:15:22 - INFO - datasets.builder - Using custom data configuration default-6e69300db3ed54e6\n",
            "05/12/2023 06:15:22 - INFO - datasets.info - Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/text\n",
            "05/12/2023 06:15:22 - INFO - datasets.builder - Generating dataset text (/content/Chinese-LLaMA-Alpaca/scripts/data_cache/pt_sample_data_text/text/default-6e69300db3ed54e6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n",
            "Downloading and preparing dataset text/default to /content/Chinese-LLaMA-Alpaca/scripts/data_cache/pt_sample_data_text/text/default-6e69300db3ed54e6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...\n",
            "Downloading data files: 100% 1/1 [00:00<00:00, 7667.83it/s]\n",
            "05/12/2023 06:15:22 - INFO - datasets.download.download_manager - Downloading took 0.0 min\n",
            "05/12/2023 06:15:22 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min\n",
            "Extracting data files: 100% 1/1 [00:00<00:00, 1421.32it/s]\n",
            "05/12/2023 06:15:22 - INFO - datasets.builder - Generating train split\n",
            "05/12/2023 06:15:23 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.\n",
            "Dataset text downloaded and prepared to /content/Chinese-LLaMA-Alpaca/scripts/data_cache/pt_sample_data_text/text/default-6e69300db3ed54e6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.\n",
            "100% 1/1 [00:00<00:00, 218.67it/s]\n",
            "05/12/2023 06:15:23 - INFO - __main__ - pt_sample_data.txt has been loaded\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #0 will write at data_cache/pt_sample_data_text/tokenized_00000_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #1 will write at data_cache/pt_sample_data_text/tokenized_00001_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #2 will write at data_cache/pt_sample_data_text/tokenized_00002_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #3 will write at data_cache/pt_sample_data_text/tokenized_00003_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #4 will write at data_cache/pt_sample_data_text/tokenized_00004_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #5 will write at data_cache/pt_sample_data_text/tokenized_00005_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #6 will write at data_cache/pt_sample_data_text/tokenized_00006_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Process #7 will write at data_cache/pt_sample_data_text/tokenized_00007_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Spawning 8 processes\n",
            "Running tokenizer on dataset (num_proc=8):   0% 0/125987 [00:00<?, ? examples/s]05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00003_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00000_of_00008.arrow\n",
            "Running tokenizer on dataset (num_proc=8):   1% 1000/125987 [00:00<00:29, 4287.42 examples/s]05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00005_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00004_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00002_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00001_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00006_of_00008.arrow\n",
            "05/12/2023 06:15:23 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/tokenized_00007_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Concatenating 8 shards\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #0 will write at data_cache/pt_sample_data_text/grouped_00000_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #1 will write at data_cache/pt_sample_data_text/grouped_00001_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #2 will write at data_cache/pt_sample_data_text/grouped_00002_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #3 will write at data_cache/pt_sample_data_text/grouped_00003_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #4 will write at data_cache/pt_sample_data_text/grouped_00004_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #5 will write at data_cache/pt_sample_data_text/grouped_00005_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #6 will write at data_cache/pt_sample_data_text/grouped_00006_of_00008.arrow\n",
            "05/12/2023 06:15:26 - INFO - datasets.arrow_dataset - Process #7 will write at data_cache/pt_sample_data_text/grouped_00007_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Spawning 8 processes\n",
            "Grouping texts in chunks of 512 (num_proc=8):   0% 0/125987 [00:00<?, ? examples/s]05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00004_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00002_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00003_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00000_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00001_of_00008.arrow\n",
            "Grouping texts in chunks of 512 (num_proc=8):   1% 1000/125987 [00:00<00:14, 8900.08 examples/s]05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00005_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00006_of_00008.arrow\n",
            "05/12/2023 06:15:27 - INFO - datasets.arrow_dataset - Caching processed dataset at data_cache/pt_sample_data_text/grouped_00007_of_00008.arrow\n",
            "05/12/2023 06:15:28 - INFO - datasets.arrow_dataset - Concatenating 8 shards\n",
            "05/12/2023 06:15:28 - INFO - datasets.arrow_dataset - Caching indices mapping at /content/Chinese-LLaMA-Alpaca/scripts/data_cache/pt_sample_data_text/cache-2e6939e892c734e7.arrow\n",
            "05/12/2023 06:15:28 - INFO - datasets.arrow_dataset - Caching indices mapping at /content/Chinese-LLaMA-Alpaca/scripts/data_cache/pt_sample_data_text/cache-058d2fd64a628682.arrow\n",
            "05/12/2023 06:15:28 - INFO - __main__ - Num train_samples  6906\n",
            "05/12/2023 06:15:28 - INFO - __main__ - training example:\n",
            "05/12/2023 06:15:28 - INFO - __main__ -  学生的自尊心和自我激励。<s><s> 最后，音乐教育有助于教育学生了解其他文化和世界。通过学习和演奏其他文化的乐曲，学生可以了解他们在其他情况下不会听到的不同音效和节奏的细微差别。例如，学习一首来自印度的音乐将让学生接触到以前从未遇到过的节奏和音阶的新思想。他们可以深入了解一种他们不熟悉的文化，并拓宽自己的世界观。<s><s> 总之，音乐教育是学生整体教育经验中的必要组成部分。它为他们提供学习、创造和表达自己的机会，同时也提供了一个平台，建立身份、信心和文化意识。音乐教育在当今世界是至关重要的，因为它让学生有机会探索自己的才能，并帮助他们成长为充满自信的个体。<s> 从以下句子中找出名字。我的兄弟汤姆和姐姐简要来拜访。汤姆<s> 简<s> 列出五种有羽毛的动物。鸡，鸭，老鹰，孔雀，蜂鸟。<s> 阅读说明，然后编写待办事项清单。去购物，收拾你的卧室，完成项目报告。待办事项清单：<s> 1. 去购物<s> 2. 收拾你的卧室<s> 3. 完成项目报告<s> 使用给定的单词之一生成唯一密码。DogD0g@P4ssW0rd<s> 发一条推文，谈谈环保的重要性。关注环保是人类健康和可持续发展的重要保证。让我们更加谨慎地行动，保护我们的星球！#环保 #可持续发展 #绿色生活<s> 使用给定单词中的一个创建电影标题。Cloud命运之云<s> 列举三个志愿参与对事业有益的原因。志愿参与对事业有益的原因有很多。首先，它使个人能够在世界上产生积极的影响。其次，它帮助个人建立对目标的同理心、热情和承诺。最后，还是结识新朋友和建立有意义的联系的好方法。<s> 填写空白以使下列句子变为真实句子：<s><s> David 获得了一份 _______ 学士学位。计算机科学David 获得了一份计算机科学学士学位。<s> 重新组织句子，使其结构反映以下结构：主语 - 谓语 - 宾语Sleeping has been my hobby for years.我已经睡觉多年了。<s>创建一条带有#SmallActionsMakeADifference标签的推文，以促进可持续发展。#SmallActionsMakeADifference！投资于节能电器，使用可重复\n",
            "Downloading (…)model.bin.index.json: 100% 25.5k/25.5k [00:00<00:00, 80.3MB/s]\n",
            "[INFO|modeling_utils.py:2534] 2023-05-12 06:15:28,999 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/pytorch_model.bin.index.json\n",
            "Downloading shards:   0% 0/33 [00:00<?, ?it/s]\n",
            "Downloading (…)l-00001-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  10% 41.9M/405M [00:00<00:00, 391MB/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  23% 94.4M/405M [00:00<00:00, 451MB/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  36% 147M/405M [00:00<00:00, 414MB/s] \u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  52% 210M/405M [00:00<00:00, 450MB/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  65% 262M/405M [00:00<00:00, 432MB/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin:  80% 325M/405M [00:00<00:00, 468MB/s]\u001b[A\n",
            "Downloading (…)l-00001-of-00033.bin: 100% 405M/405M [00:00<00:00, 472MB/s]\n",
            "Downloading shards:   3% 1/33 [00:01<00:35,  1.12s/it]\n",
            "Downloading (…)l-00002-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 483MB/s]\u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin:  28% 115M/405M [00:00<00:00, 561MB/s] \u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin:  44% 178M/405M [00:00<00:00, 505MB/s]\u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin:  60% 241M/405M [00:00<00:00, 539MB/s]\u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin:  78% 315M/405M [00:00<00:00, 575MB/s]\u001b[A\n",
            "Downloading (…)l-00002-of-00033.bin: 100% 405M/405M [00:00<00:00, 552MB/s]\n",
            "Downloading shards:   6% 2/33 [00:02<00:31,  1.03s/it]\n",
            "Downloading (…)l-00003-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 564MB/s]\u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin:  31% 126M/405M [00:00<00:00, 555MB/s] \u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin:  47% 189M/405M [00:00<00:00, 560MB/s]\u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin:  62% 252M/405M [00:00<00:00, 554MB/s]\u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin:  78% 315M/405M [00:00<00:00, 547MB/s]\u001b[A\n",
            "Downloading (…)l-00003-of-00033.bin: 100% 405M/405M [00:00<00:00, 523MB/s]\n",
            "Downloading shards:   9% 3/33 [00:03<00:30,  1.03s/it]\n",
            "Downloading (…)l-00004-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 565MB/s]\u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin:  31% 126M/405M [00:00<00:00, 579MB/s] \u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin:  47% 189M/405M [00:00<00:00, 589MB/s]\u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin:  62% 252M/405M [00:00<00:00, 585MB/s]\u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin:  78% 315M/405M [00:00<00:00, 524MB/s]\u001b[A\n",
            "Downloading (…)l-00004-of-00033.bin: 100% 405M/405M [00:00<00:00, 553MB/s]\n",
            "Downloading shards:  12% 4/33 [00:04<00:29,  1.01s/it]\n",
            "Downloading (…)l-00005-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 511MB/s]\u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin:  28% 115M/405M [00:00<00:00, 557MB/s] \u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin:  44% 178M/405M [00:00<00:00, 561MB/s]\u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin:  60% 241M/405M [00:00<00:00, 535MB/s]\u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin:  75% 304M/405M [00:00<00:00, 530MB/s]\u001b[A\n",
            "Downloading (…)l-00005-of-00033.bin: 100% 405M/405M [00:00<00:00, 546MB/s]\n",
            "Downloading shards:  15% 5/33 [00:05<00:28,  1.00s/it]\n",
            "Downloading (…)l-00006-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 539MB/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  31% 126M/405M [00:00<00:00, 514MB/s] \u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  44% 178M/405M [00:00<00:00, 493MB/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  57% 231M/405M [00:00<00:00, 455MB/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  70% 283M/405M [00:00<00:00, 448MB/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin:  85% 346M/405M [00:00<00:00, 485MB/s]\u001b[A\n",
            "Downloading (…)l-00006-of-00033.bin: 100% 405M/405M [00:00<00:00, 489MB/s]\n",
            "Downloading shards:  18% 6/33 [00:06<00:27,  1.03s/it]\n",
            "Downloading (…)l-00007-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 568MB/s]\u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin:  31% 126M/405M [00:00<00:00, 592MB/s] \u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin:  47% 189M/405M [00:00<00:00, 581MB/s]\u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin:  62% 252M/405M [00:00<00:00, 534MB/s]\u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin:  78% 315M/405M [00:00<00:00, 513MB/s]\u001b[A\n",
            "Downloading (…)l-00007-of-00033.bin: 100% 405M/405M [00:00<00:00, 509MB/s]\n",
            "Downloading shards:  21% 7/33 [00:07<00:26,  1.04s/it]\n",
            "Downloading (…)l-00008-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 439MB/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  26% 105M/405M [00:00<00:00, 457MB/s] \u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  39% 157M/405M [00:00<00:00, 468MB/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  52% 210M/405M [00:00<00:00, 470MB/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  65% 262M/405M [00:00<00:00, 474MB/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin:  78% 315M/405M [00:00<00:00, 483MB/s]\u001b[A\n",
            "Downloading (…)l-00008-of-00033.bin: 100% 405M/405M [00:00<00:00, 494MB/s]\n",
            "Downloading shards:  24% 8/33 [00:08<00:26,  1.05s/it]\n",
            "Downloading (…)l-00009-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 588MB/s]\u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin:  31% 126M/405M [00:00<00:00, 602MB/s] \u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin:  47% 189M/405M [00:00<00:00, 608MB/s]\u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin:  62% 252M/405M [00:00<00:00, 614MB/s]\u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin:  78% 315M/405M [00:00<00:00, 591MB/s]\u001b[A\n",
            "Downloading (…)l-00009-of-00033.bin: 100% 405M/405M [00:00<00:00, 502MB/s]\n",
            "Downloading shards:  27% 9/33 [00:09<00:25,  1.05s/it]\n",
            "Downloading (…)l-00010-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:   5% 21.0M/405M [00:00<00:03, 106MB/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:  16% 62.9M/405M [00:00<00:01, 235MB/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:  31% 126M/405M [00:00<00:00, 361MB/s] \u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:  47% 189M/405M [00:00<00:00, 437MB/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:  62% 252M/405M [00:00<00:00, 491MB/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin:  78% 315M/405M [00:00<00:00, 519MB/s]\u001b[A\n",
            "Downloading (…)l-00010-of-00033.bin: 100% 405M/405M [00:00<00:00, 427MB/s]\n",
            "Downloading shards:  30% 10/33 [00:10<00:25,  1.10s/it]\n",
            "Downloading (…)l-00011-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 551MB/s]\u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin:  31% 126M/405M [00:00<00:00, 579MB/s] \u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin:  47% 189M/405M [00:00<00:00, 583MB/s]\u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin:  62% 252M/405M [00:00<00:00, 538MB/s]\u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin:  78% 315M/405M [00:00<00:00, 429MB/s]\u001b[A\n",
            "Downloading (…)l-00011-of-00033.bin: 100% 405M/405M [00:00<00:00, 423MB/s]\n",
            "Downloading shards:  33% 11/33 [00:11<00:24,  1.13s/it]\n",
            "Downloading (…)l-00012-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 580MB/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  31% 126M/405M [00:00<00:00, 583MB/s] \u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  47% 189M/405M [00:00<00:00, 588MB/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  62% 252M/405M [00:00<00:00, 333MB/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  75% 304M/405M [00:00<00:00, 370MB/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin:  88% 357M/405M [00:00<00:00, 393MB/s]\u001b[A\n",
            "Downloading (…)l-00012-of-00033.bin: 100% 405M/405M [00:00<00:00, 413MB/s]\n",
            "Downloading shards:  36% 12/33 [00:12<00:24,  1.16s/it]\n",
            "Downloading (…)l-00013-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  10% 41.9M/405M [00:00<00:00, 364MB/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  26% 105M/405M [00:00<00:00, 492MB/s] \u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  39% 157M/405M [00:00<00:00, 447MB/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  52% 210M/405M [00:00<00:00, 454MB/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  67% 273M/405M [00:00<00:00, 506MB/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin:  80% 325M/405M [00:00<00:00, 473MB/s]\u001b[A\n",
            "Downloading (…)l-00013-of-00033.bin: 100% 405M/405M [00:00<00:00, 464MB/s]\n",
            "Downloading shards:  39% 13/33 [00:14<00:22,  1.15s/it]\n",
            "Downloading (…)l-00014-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  26% 105M/405M [00:00<00:00, 466MB/s] \u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  41% 168M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  54% 220M/405M [00:00<00:00, 460MB/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  70% 283M/405M [00:00<00:00, 506MB/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin:  83% 336M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00014-of-00033.bin: 100% 405M/405M [00:00<00:00, 458MB/s]\n",
            "Downloading shards:  42% 14/33 [00:15<00:21,  1.14s/it]\n",
            "Downloading (…)l-00015-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 540MB/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  31% 126M/405M [00:00<00:00, 533MB/s] \u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  47% 189M/405M [00:00<00:00, 504MB/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  60% 241M/405M [00:00<00:00, 467MB/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  73% 294M/405M [00:00<00:00, 427MB/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin:  85% 346M/405M [00:00<00:00, 395MB/s]\u001b[A\n",
            "Downloading (…)l-00015-of-00033.bin: 100% 405M/405M [00:00<00:00, 408MB/s]\n",
            "Downloading shards:  45% 15/33 [00:16<00:21,  1.17s/it]\n",
            "Downloading (…)l-00016-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 482MB/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  26% 105M/405M [00:00<00:00, 501MB/s] \u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  39% 157M/405M [00:00<00:00, 509MB/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  52% 210M/405M [00:00<00:00, 513MB/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  65% 262M/405M [00:00<00:00, 488MB/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin:  78% 315M/405M [00:00<00:00, 499MB/s]\u001b[A\n",
            "Downloading (…)l-00016-of-00033.bin: 100% 405M/405M [00:00<00:00, 502MB/s]\n",
            "Downloading shards:  48% 16/33 [00:17<00:19,  1.14s/it]\n",
            "Downloading (…)l-00017-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 503MB/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  26% 105M/405M [00:00<00:00, 502MB/s] \u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  39% 157M/405M [00:00<00:00, 498MB/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  52% 210M/405M [00:00<00:00, 496MB/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  65% 262M/405M [00:00<00:00, 497MB/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin:  78% 315M/405M [00:00<00:00, 481MB/s]\u001b[A\n",
            "Downloading (…)l-00017-of-00033.bin: 100% 405M/405M [00:00<00:00, 488MB/s]\n",
            "Downloading shards:  52% 17/33 [00:18<00:18,  1.18s/it]\n",
            "Downloading (…)l-00018-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 536MB/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  31% 126M/405M [00:00<00:00, 462MB/s] \u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  44% 178M/405M [00:00<00:00, 477MB/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  57% 231M/405M [00:00<00:00, 488MB/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  70% 283M/405M [00:00<00:00, 493MB/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin:  83% 336M/405M [00:00<00:00, 498MB/s]\u001b[A\n",
            "Downloading (…)l-00018-of-00033.bin: 100% 405M/405M [00:00<00:00, 486MB/s]\n",
            "Downloading shards:  55% 18/33 [00:20<00:17,  1.19s/it]\n",
            "Downloading (…)l-00019-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 516MB/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  26% 105M/405M [00:00<00:00, 513MB/s] \u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  39% 157M/405M [00:00<00:00, 509MB/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  52% 210M/405M [00:00<00:00, 507MB/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  65% 262M/405M [00:00<00:00, 506MB/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin:  78% 315M/405M [00:00<00:00, 493MB/s]\u001b[A\n",
            "Downloading (…)l-00019-of-00033.bin: 100% 405M/405M [00:00<00:00, 451MB/s]\n",
            "Downloading shards:  58% 19/33 [00:21<00:16,  1.18s/it]\n",
            "Downloading (…)l-00020-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 512MB/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  26% 105M/405M [00:00<00:00, 500MB/s] \u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  39% 157M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  52% 210M/405M [00:00<00:00, 501MB/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  65% 262M/405M [00:00<00:00, 449MB/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin:  78% 315M/405M [00:00<00:00, 436MB/s]\u001b[A\n",
            "Downloading (…)l-00020-of-00033.bin: 100% 405M/405M [00:00<00:00, 451MB/s]\n",
            "Downloading shards:  61% 20/33 [00:22<00:15,  1.17s/it]\n",
            "Downloading (…)l-00021-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:   3% 10.5M/405M [00:00<00:08, 44.4MB/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  16% 62.9M/405M [00:00<00:01, 215MB/s] \u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  28% 115M/405M [00:00<00:00, 295MB/s] \u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  39% 157M/405M [00:00<00:00, 333MB/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  49% 199M/405M [00:00<00:00, 358MB/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  62% 252M/405M [00:00<00:00, 403MB/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin:  75% 304M/405M [00:00<00:00, 434MB/s]\u001b[A\n",
            "Downloading (…)l-00021-of-00033.bin: 100% 405M/405M [00:01<00:00, 371MB/s]\n",
            "Downloading shards:  64% 21/33 [00:23<00:14,  1.22s/it]\n",
            "Downloading (…)l-00022-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 483MB/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  26% 105M/405M [00:00<00:00, 495MB/s] \u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  39% 157M/405M [00:00<00:00, 498MB/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  52% 210M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  65% 262M/405M [00:00<00:00, 503MB/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin:  78% 315M/405M [00:00<00:00, 503MB/s]\u001b[A\n",
            "Downloading (…)l-00022-of-00033.bin: 100% 405M/405M [00:00<00:00, 501MB/s]\n",
            "Downloading shards:  67% 22/33 [00:24<00:12,  1.17s/it]\n",
            "Downloading (…)l-00023-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 513MB/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  28% 115M/405M [00:00<00:00, 523MB/s] \u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  44% 178M/405M [00:00<00:00, 526MB/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  60% 241M/405M [00:00<00:00, 495MB/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  73% 294M/405M [00:00<00:00, 500MB/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin:  85% 346M/405M [00:00<00:00, 503MB/s]\u001b[A\n",
            "Downloading (…)l-00023-of-00033.bin: 100% 405M/405M [00:00<00:00, 481MB/s]\n",
            "Downloading shards:  70% 23/33 [00:25<00:11,  1.15s/it]\n",
            "Downloading (…)l-00024-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 586MB/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  31% 126M/405M [00:00<00:00, 511MB/s] \u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  44% 178M/405M [00:00<00:00, 438MB/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  57% 231M/405M [00:00<00:00, 382MB/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  67% 273M/405M [00:00<00:00, 381MB/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin:  78% 315M/405M [00:00<00:00, 386MB/s]\u001b[A\n",
            "Downloading (…)l-00024-of-00033.bin: 100% 405M/405M [00:00<00:00, 416MB/s]\n",
            "Downloading shards:  73% 24/33 [00:27<00:10,  1.17s/it]\n",
            "Downloading (…)l-00025-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 596MB/s]\u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin:  31% 126M/405M [00:00<00:00, 579MB/s] \u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin:  47% 189M/405M [00:00<00:00, 553MB/s]\u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin:  62% 252M/405M [00:00<00:00, 554MB/s]\u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin:  78% 315M/405M [00:00<00:00, 545MB/s]\u001b[A\n",
            "Downloading (…)l-00025-of-00033.bin: 100% 405M/405M [00:00<00:00, 542MB/s]\n",
            "Downloading shards:  76% 25/33 [00:28<00:08,  1.12s/it]\n",
            "Downloading (…)l-00026-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 534MB/s]\u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin:  31% 126M/405M [00:00<00:00, 541MB/s] \u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin:  47% 189M/405M [00:00<00:00, 562MB/s]\u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin:  62% 252M/405M [00:00<00:00, 539MB/s]\u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin:  78% 315M/405M [00:00<00:00, 514MB/s]\u001b[A\n",
            "Downloading (…)l-00026-of-00033.bin: 100% 405M/405M [00:00<00:00, 521MB/s]\n",
            "Downloading shards:  79% 26/33 [00:29<00:07,  1.12s/it]\n",
            "Downloading (…)l-00027-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 591MB/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  31% 126M/405M [00:00<00:00, 602MB/s] \u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  49% 199M/405M [00:00<00:00, 616MB/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  65% 262M/405M [00:00<00:00, 440MB/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  78% 315M/405M [00:00<00:00, 360MB/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin:  88% 357M/405M [00:00<00:00, 324MB/s]\u001b[A\n",
            "Downloading (…)l-00027-of-00033.bin: 100% 405M/405M [00:01<00:00, 354MB/s]\n",
            "Downloading shards:  82% 27/33 [00:30<00:07,  1.20s/it]\n",
            "Downloading (…)l-00028-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 523MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  28% 115M/405M [00:00<00:00, 521MB/s] \u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  41% 168M/405M [00:00<00:00, 456MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  54% 220M/405M [00:00<00:00, 406MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  65% 262M/405M [00:00<00:00, 398MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  75% 304M/405M [00:00<00:00, 399MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin:  88% 357M/405M [00:00<00:00, 418MB/s]\u001b[A\n",
            "Downloading (…)l-00028-of-00033.bin: 100% 405M/405M [00:00<00:00, 427MB/s]\n",
            "Downloading shards:  85% 28/33 [00:31<00:05,  1.20s/it]\n",
            "Downloading (…)l-00029-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin:  16% 62.9M/405M [00:00<00:00, 508MB/s]\u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin:  28% 115M/405M [00:00<00:00, 501MB/s] \u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin:  44% 178M/405M [00:00<00:00, 512MB/s]\u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin:  60% 241M/405M [00:00<00:00, 517MB/s]\u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin:  75% 304M/405M [00:00<00:00, 520MB/s]\u001b[A\n",
            "Downloading (…)l-00029-of-00033.bin: 100% 405M/405M [00:00<00:00, 516MB/s]\n",
            "Downloading shards:  88% 29/33 [00:32<00:04,  1.15s/it]\n",
            "Downloading (…)l-00030-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 518MB/s]\u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin:  28% 115M/405M [00:00<00:00, 527MB/s] \u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin:  44% 178M/405M [00:00<00:00, 528MB/s]\u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin:  60% 241M/405M [00:00<00:00, 533MB/s]\u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin:  75% 304M/405M [00:00<00:00, 531MB/s]\u001b[A\n",
            "Downloading (…)l-00030-of-00033.bin: 100% 405M/405M [00:00<00:00, 527MB/s]\n",
            "Downloading shards:  91% 30/33 [00:33<00:03,  1.11s/it]\n",
            "Downloading (…)l-00031-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 521MB/s]\u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin:  28% 115M/405M [00:00<00:00, 530MB/s] \u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin:  44% 178M/405M [00:00<00:00, 528MB/s]\u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin:  60% 241M/405M [00:00<00:00, 556MB/s]\u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin:  75% 304M/405M [00:00<00:00, 578MB/s]\u001b[A\n",
            "Downloading (…)l-00031-of-00033.bin: 100% 405M/405M [00:00<00:00, 511MB/s]\n",
            "Downloading shards:  94% 31/33 [00:34<00:02,  1.09s/it]\n",
            "Downloading (…)l-00032-of-00033.bin:   0% 0.00/405M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  13% 52.4M/405M [00:00<00:00, 505MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  26% 105M/405M [00:00<00:00, 514MB/s] \u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  39% 157M/405M [00:00<00:00, 280MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  49% 199M/405M [00:00<00:00, 251MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  57% 231M/405M [00:00<00:00, 243MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  65% 262M/405M [00:00<00:00, 247MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  75% 304M/405M [00:01<00:00, 274MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin:  85% 346M/405M [00:01<00:00, 308MB/s]\u001b[A\n",
            "Downloading (…)l-00032-of-00033.bin: 100% 405M/405M [00:01<00:00, 303MB/s]\n",
            "Downloading shards:  97% 32/33 [00:36<00:01,  1.24s/it]\n",
            "Downloading (…)l-00033-of-00033.bin:   0% 0.00/524M [00:00<?, ?B/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  10% 52.4M/524M [00:00<00:00, 511MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  22% 115M/524M [00:00<00:00, 524MB/s] \u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  34% 178M/524M [00:00<00:00, 525MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  44% 231M/524M [00:00<00:00, 523MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  54% 283M/524M [00:00<00:00, 523MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  66% 346M/524M [00:00<00:00, 525MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  78% 409M/524M [00:00<00:00, 521MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin:  88% 461M/524M [00:00<00:00, 515MB/s]\u001b[A\n",
            "Downloading (…)l-00033-of-00033.bin: 100% 524M/524M [00:01<00:00, 504MB/s]\n",
            "Downloading shards: 100% 33/33 [00:37<00:00,  1.14s/it]\n",
            "[INFO|modeling_utils.py:1176] 2023-05-12 06:16:06,686 >> Instantiating LlamaForCausalLM model under default dtype torch.float16.\n",
            "[INFO|configuration_utils.py:575] 2023-05-12 06:16:06,687 >> Generate config GenerationConfig {\n",
            "  \"_from_model_config\": true,\n",
            "  \"bos_token_id\": 0,\n",
            "  \"eos_token_id\": 1,\n",
            "  \"pad_token_id\": -1,\n",
            "  \"transformers_version\": \"4.28.1\"\n",
            "}\n",
            "\n",
            "Loading checkpoint shards: 100% 33/33 [00:14<00:00,  2.35it/s]\n",
            "[INFO|modeling_utils.py:3190] 2023-05-12 06:16:23,699 >> All model checkpoint weights were used when initializing LlamaForCausalLM.\n",
            "\n",
            "[INFO|modeling_utils.py:3198] 2023-05-12 06:16:23,699 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at decapoda-research/llama-7b-hf.\n",
            "If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.\n",
            "Downloading (…)neration_config.json: 100% 124/124 [00:00<00:00, 781kB/s]\n",
            "[INFO|configuration_utils.py:537] 2023-05-12 06:16:24,167 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/generation_config.json\n",
            "[INFO|configuration_utils.py:575] 2023-05-12 06:16:24,167 >> Generate config GenerationConfig {\n",
            "  \"_from_model_config\": true,\n",
            "  \"bos_token_id\": 0,\n",
            "  \"eos_token_id\": 1,\n",
            "  \"pad_token_id\": 0,\n",
            "  \"transformers_version\": \"4.28.1\"\n",
            "}\n",
            "\n",
            "05/12/2023 06:16:56 - INFO - __main__ - Init new peft model\n",
            "['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']\n",
            "8\n",
            "trainable params: 429203456 || all params: 6905475072 || trainable%: 6.2154080859739\n",
            "[INFO|trainer.py:564] 2023-05-12 06:18:27,733 >> max_steps is given, it will override any value given in num_train_epochs\n",
            "[INFO|trainer.py:621] 2023-05-12 06:18:27,734 >> Using cuda_amp half precision backend\n",
            "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
            "  warnings.warn(\n",
            "[2023-05-12 06:18:27,759] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.9.2, git-hash=unknown, git-branch=unknown\n",
            "05/12/2023 06:18:31 - INFO - torch.distributed.distributed_c10d - Added key: store_based_barrier_key:2 to store for rank: 0\n",
            "05/12/2023 06:18:31 - INFO - torch.distributed.distributed_c10d - Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.\n",
            "[2023-05-12 06:18:31,488] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n",
            "[2023-05-12 06:18:31,489] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer\n",
            "[2023-05-12 06:18:31,489] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer\n",
            "[2023-05-12 06:18:31,533] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW\n",
            "[2023-05-12 06:18:31,534] [INFO] [utils.py:54:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class 'transformers.optimization.AdamW'>\n",
            "[2023-05-12 06:18:31,534] [WARNING] [engine.py:1104:_do_optimizer_sanity_check] **** You are using ZeRO with an untested optimizer, proceed with caution *****\n",
            "[2023-05-12 06:18:31,534] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer\n",
            "[2023-05-12 06:18:31,534] [INFO] [stage_1_and_2.py:133:__init__] Reduce bucket size 100000000\n",
            "[2023-05-12 06:18:31,534] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 100000000\n",
            "[2023-05-12 06:18:31,534] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False\n",
            "[2023-05-12 06:18:31,534] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False\n",
            "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\n",
            "Creating extension directory /root/.cache/torch_extensions/py310_cu118/utils...\n",
            "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/utils/build.ninja...\n",
            "Building extension module utils...\n",
            "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
            "[1/2] c++ -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o \n",
            "[2/2] c++ flatten_unflatten.o -shared -L/usr/local/lib/python3.10/dist-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so\n",
            "Loading extension module utils...\n",
            "Time to load utils op: 18.753242015838623 seconds\n",
            "Rank: 0 partition count [1] and sizes[(429203456, False)] \n",
            "[2023-05-12 06:18:52,360] [INFO] [utils.py:785:see_memory_usage] Before initializing optimizer states\n",
            "[2023-05-12 06:18:52,361] [INFO] [utils.py:786:see_memory_usage] MA 14.49 GB         Max_MA 15.29 GB         CA 15.33 GB         Max_CA 15 GB \n",
            "[2023-05-12 06:18:52,361] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 4.4 GB, percent = 5.3%\n",
            "[2023-05-12 06:18:52,592] [INFO] [utils.py:785:see_memory_usage] After initializing optimizer states\n",
            "[2023-05-12 06:18:52,592] [INFO] [utils.py:786:see_memory_usage] MA 17.69 GB         Max_MA 20.89 GB         CA 21.73 GB         Max_CA 22 GB \n",
            "[2023-05-12 06:18:52,593] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 4.4 GB, percent = 5.3%\n",
            "[2023-05-12 06:18:52,593] [INFO] [stage_1_and_2.py:489:__init__] optimizer state initialized\n",
            "[2023-05-12 06:18:52,803] [INFO] [utils.py:785:see_memory_usage] After initializing ZeRO optimizer\n",
            "[2023-05-12 06:18:52,804] [INFO] [utils.py:786:see_memory_usage] MA 17.69 GB         Max_MA 17.69 GB         CA 21.73 GB         Max_CA 22 GB \n",
            "[2023-05-12 06:18:52,804] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 4.4 GB, percent = 5.3%\n",
            "[2023-05-12 06:18:52,818] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW\n",
            "[2023-05-12 06:18:52,818] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler\n",
            "[2023-05-12 06:18:52,818] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7f628451abc0>\n",
            "[2023-05-12 06:18:52,818] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]\n",
            "[2023-05-12 06:18:52,820] [INFO] [config.py:955:print] DeepSpeedEngine configuration:\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   activation_checkpointing_config  {\n",
            "    \"partition_activations\": false, \n",
            "    \"contiguous_memory_optimization\": false, \n",
            "    \"cpu_checkpointing\": false, \n",
            "    \"number_checkpoints\": null, \n",
            "    \"synchronize_checkpoint_boundary\": false, \n",
            "    \"profile\": false\n",
            "}\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   amp_enabled .................. False\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   amp_params ................... False\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   autotuning_config ............ {\n",
            "    \"enabled\": false, \n",
            "    \"start_step\": null, \n",
            "    \"end_step\": null, \n",
            "    \"metric_path\": null, \n",
            "    \"arg_mappings\": null, \n",
            "    \"metric\": \"throughput\", \n",
            "    \"model_info\": null, \n",
            "    \"results_dir\": \"autotuning_results\", \n",
            "    \"exps_dir\": \"autotuning_exps\", \n",
            "    \"overwrite\": true, \n",
            "    \"fast\": true, \n",
            "    \"start_profile_step\": 3, \n",
            "    \"end_profile_step\": 5, \n",
            "    \"tuner_type\": \"gridsearch\", \n",
            "    \"tuner_early_stopping\": 5, \n",
            "    \"tuner_num_trials\": 50, \n",
            "    \"model_info_path\": null, \n",
            "    \"mp_size\": 1, \n",
            "    \"max_train_batch_size\": null, \n",
            "    \"min_train_batch_size\": 1, \n",
            "    \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n",
            "    \"min_train_micro_batch_size_per_gpu\": 1, \n",
            "    \"num_tuning_micro_batch_sizes\": 3\n",
            "}\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   bfloat16_enabled ............. False\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   checkpoint_parallel_write_pipeline  False\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   checkpoint_tag_validation_enabled  True\n",
            "[2023-05-12 06:18:52,821] [INFO] [config.py:959:print]   checkpoint_tag_validation_fail  False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f628451b7c0>\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   communication_data_type ...... None\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   curriculum_enabled_legacy .... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   curriculum_params_legacy ..... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   data_efficiency_enabled ...... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   dataloader_drop_last ......... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   disable_allgather ............ False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   dump_state ................... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'min_scale': 1e-10}\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_enabled ........... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_gas_boundary_resolution  1\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_layer_name ........ bert.encoder.layer\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_layer_num ......... 0\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_max_iter .......... 100\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_stability ......... 1e-06\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_tol ............... 0.01\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   eigenvalue_verbose ........... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   elasticity_enabled ........... False\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   flops_profiler_config ........ {\n",
            "    \"enabled\": false, \n",
            "    \"profile_step\": 1, \n",
            "    \"module_depth\": -1, \n",
            "    \"top_modules\": 1, \n",
            "    \"detailed\": true, \n",
            "    \"output_file\": null\n",
            "}\n",
            "[2023-05-12 06:18:52,822] [INFO] [config.py:959:print]   fp16_auto_cast ............... False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   fp16_enabled ................. True\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   fp16_master_weights_and_gradients  False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   global_rank .................. 0\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   grad_accum_dtype ............. None\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   gradient_accumulation_steps .. 1\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   gradient_clipping ............ 1.0\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   gradient_predivide_factor .... 1.0\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   initial_dynamic_scale ........ 65536\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   load_universal_checkpoint .... False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   loss_scale ................... 0\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   memory_breakdown ............. False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   mics_hierarchial_params_gather  False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   mics_shard_size .............. -1\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   nebula_config ................ {\n",
            "    \"enabled\": false, \n",
            "    \"persistent_storage_path\": null, \n",
            "    \"persistent_time_interval\": 100, \n",
            "    \"num_of_version_in_retention\": 2, \n",
            "    \"enable_nebula_load\": true, \n",
            "    \"load_path\": null\n",
            "}\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   optimizer_legacy_fusion ...... False\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   optimizer_name ............... None\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   optimizer_params ............. None\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}\n",
            "[2023-05-12 06:18:52,823] [INFO] [config.py:959:print]   pld_enabled .................. False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   pld_params ................... False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   prescale_gradients ........... False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   scheduler_name ............... None\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   scheduler_params ............. None\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   sparse_attention ............. None\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   sparse_gradients_enabled ..... False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   steps_per_print .............. 2000\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   train_batch_size ............. 1\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   train_micro_batch_size_per_gpu  1\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   use_node_local_storage ....... False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   wall_clock_breakdown ......... False\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   world_size ................... 1\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   zero_allow_untested_optimizer  True\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=100000000 allgather_partitions=True allgather_bucket_size=100000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   zero_enabled ................. True\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   zero_force_ds_cpu_optimizer .. True\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:959:print]   zero_optimization_stage ...... 2\n",
            "[2023-05-12 06:18:52,824] [INFO] [config.py:945:print_user_config]   json = {\n",
            "    \"fp16\": {\n",
            "        \"enabled\": true, \n",
            "        \"loss_scale\": 0, \n",
            "        \"loss_scale_window\": 100, \n",
            "        \"initial_scale_power\": 16, \n",
            "        \"hysteresis\": 2, \n",
            "        \"min_loss_scale\": 1e-10\n",
            "    }, \n",
            "    \"zero_optimization\": {\n",
            "        \"stage\": 2, \n",
            "        \"allgather_partitions\": true, \n",
            "        \"allgather_bucket_size\": 1.000000e+08, \n",
            "        \"overlap_comm\": true, \n",
            "        \"reduce_scatter\": true, \n",
            "        \"reduce_bucket_size\": 1.000000e+08, \n",
            "        \"contiguous_gradients\": true\n",
            "    }, \n",
            "    \"gradient_accumulation_steps\": 1, \n",
            "    \"gradient_clipping\": 1.0, \n",
            "    \"steps_per_print\": 2.000000e+03, \n",
            "    \"train_batch_size\": 1, \n",
            "    \"train_micro_batch_size_per_gpu\": 1, \n",
            "    \"wall_clock_breakdown\": false, \n",
            "    \"zero_allow_untested_optimizer\": true\n",
            "}\n",
            "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\n",
            "No modifications detected for re-loaded extension module utils, skipping build step...\n",
            "Loading extension module utils...\n",
            "Time to load utils op: 0.00040268898010253906 seconds\n",
            "[INFO|trainer.py:1769] 2023-05-12 06:18:52,827 >> ***** Running training *****\n",
            "[INFO|trainer.py:1770] 2023-05-12 06:18:52,827 >>   Num examples = 6,906\n",
            "[INFO|trainer.py:1771] 2023-05-12 06:18:52,827 >>   Num Epochs = 1\n",
            "[INFO|trainer.py:1772] 2023-05-12 06:18:52,827 >>   Instantaneous batch size per device = 1\n",
            "[INFO|trainer.py:1773] 2023-05-12 06:18:52,827 >>   Total train batch size (w. parallel, distributed & accumulation) = 1\n",
            "[INFO|trainer.py:1774] 2023-05-12 06:18:52,827 >>   Gradient Accumulation steps = 1\n",
            "[INFO|trainer.py:1775] 2023-05-12 06:18:52,827 >>   Total optimization steps = 100\n",
            "[INFO|trainer.py:1776] 2023-05-12 06:18:52,830 >>   Number of trainable parameters = 429,203,456\n",
            "  0% 0/100 [00:00<?, ?it/s][WARNING|logging.py:295] 2023-05-12 06:18:52,874 >> `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n",
            "[2023-05-12 06:18:56,313] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1\n",
            "{'loss': 10.3047, 'learning_rate': 0.0, 'epoch': 0.0}\n",
            "  1% 1/100 [00:03<05:43,  3.47s/it][2023-05-12 06:18:56,654] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768\n",
            "  2% 2/100 [00:03<02:39,  1.63s/it][2023-05-12 06:18:56,989] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384\n",
            "  4% 4/100 [00:04<01:16,  1.25it/s][2023-05-12 06:18:57,753] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192\n",
            "  6% 6/100 [00:05<00:52,  1.78it/s][2023-05-12 06:18:58,534] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096\n",
            "{'loss': 10.4913, 'learning_rate': 0.0002, 'epoch': 0.0}\n",
            "{'loss': 8.243, 'learning_rate': 0.00019458172417006347, 'epoch': 0.0}\n",
            " 25% 25/100 [00:13<00:32,  2.30it/s][2023-05-12 06:19:06,764] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048\n",
            "{'loss': 7.6727, 'learning_rate': 0.00018090169943749476, 'epoch': 0.0}\n",
            "{'loss': 7.4988, 'learning_rate': 0.00015743286626829437, 'epoch': 0.01}\n",
            "{'loss': 7.0074, 'learning_rate': 0.00012774029087618446, 'epoch': 0.01}\n",
            " 50% 50/100 [00:24<00:21,  2.31it/s][INFO|trainer.py:2868] 2023-05-12 06:19:17,175 >> Saving model checkpoint to /content/output_model/checkpoint-50\n",
            "[INFO|trainer.py:2880] 2023-05-12 06:19:17,182 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n",
            "[INFO|tokenization_utils_base.py:2171] 2023-05-12 06:19:18,702 >> tokenizer config file saved in /content/output_model/checkpoint-50/tokenizer_config.json\n",
            "[INFO|tokenization_utils_base.py:2178] 2023-05-12 06:19:18,702 >> Special tokens file saved in /content/output_model/checkpoint-50/special_tokens_map.json\n",
            "[2023-05-12 06:19:18,704] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step50 is about to be saved!\n",
            "[2023-05-12 06:19:27,197] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /content/output_model/checkpoint-50/global_step50/mp_rank_00_model_states.pt\n",
            "[2023-05-12 06:19:27,197] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /content/output_model/checkpoint-50/global_step50/mp_rank_00_model_states.pt...\n",
            "[2023-05-12 06:20:06,793] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /content/output_model/checkpoint-50/global_step50/mp_rank_00_model_states.pt.\n",
            "[2023-05-12 06:20:07,322] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /content/output_model/checkpoint-50/global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
            "[2023-05-12 06:20:21,414] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /content/output_model/checkpoint-50/global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
            "[2023-05-12 06:20:21,415] [INFO] [engine.py:3228:_save_zero_checkpoint] zero checkpoint saved /content/output_model/checkpoint-50/global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
            "[2023-05-12 06:20:21,415] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step50 is ready now!\n",
            "{'loss': 7.0254, 'learning_rate': 9.504162453267777e-05, 'epoch': 0.01}\n",
            " 66% 66/100 [01:35<00:17,  1.90it/s][2023-05-12 06:20:28,718] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024\n",
            "{'loss': 6.607, 'learning_rate': 6.59706825558357e-05, 'epoch': 0.01}\n",
            "{'loss': 6.4305, 'learning_rate': 3.7282364152646297e-05, 'epoch': 0.01}\n",
            "{'loss': 6.5699, 'learning_rate': 1.5390474757906446e-05, 'epoch': 0.01}\n",
            "{'loss': 5.8727, 'learning_rate': 2.667340275199426e-06, 'epoch': 0.01}\n",
            "100% 100/100 [01:50<00:00,  2.32it/s][INFO|trainer.py:2868] 2023-05-12 06:20:42,974 >> Saving model checkpoint to /content/output_model/checkpoint-100\n",
            "[INFO|trainer.py:2880] 2023-05-12 06:20:42,981 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n",
            "[INFO|tokenization_utils_base.py:2171] 2023-05-12 06:20:44,510 >> tokenizer config file saved in /content/output_model/checkpoint-100/tokenizer_config.json\n",
            "[INFO|tokenization_utils_base.py:2178] 2023-05-12 06:20:44,510 >> Special tokens file saved in /content/output_model/checkpoint-100/special_tokens_map.json\n",
            "[2023-05-12 06:20:44,512] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step100 is about to be saved!\n",
            "[2023-05-12 06:20:53,180] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /content/output_model/checkpoint-100/global_step100/mp_rank_00_model_states.pt\n",
            "[2023-05-12 06:20:53,180] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /content/output_model/checkpoint-100/global_step100/mp_rank_00_model_states.pt...\n",
            "[2023-05-12 06:21:31,680] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /content/output_model/checkpoint-100/global_step100/mp_rank_00_model_states.pt.\n",
            "[2023-05-12 06:21:32,241] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /content/output_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
            "[2023-05-12 06:21:46,217] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /content/output_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
            "[2023-05-12 06:21:46,218] [INFO] [engine.py:3228:_save_zero_checkpoint] zero checkpoint saved /content/output_model/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
            "[2023-05-12 06:21:46,218] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now!\n",
            "[INFO|trainer.py:2039] 2023-05-12 06:21:46,220 >> \n",
            "\n",
            "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
            "\n",
            "\n",
            "{'train_runtime': 173.3904, 'train_samples_per_second': 0.577, 'train_steps_per_second': 0.577, 'train_loss': 7.34, 'epoch': 0.01}\n",
            "100% 100/100 [02:54<00:00,  1.75s/it]\n",
            "[INFO|trainer.py:2868] 2023-05-12 06:21:47,497 >> Saving model checkpoint to /content/output_model\n",
            "[INFO|trainer.py:2880] 2023-05-12 06:21:47,504 >> Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n",
            "[INFO|tokenization_utils_base.py:2171] 2023-05-12 06:21:49,250 >> tokenizer config file saved in /content/output_model/tokenizer_config.json\n",
            "[INFO|tokenization_utils_base.py:2178] 2023-05-12 06:21:49,250 >> Special tokens file saved in /content/output_model/special_tokens_map.json\n",
            "***** train metrics *****\n",
            "  epoch                    =       0.01\n",
            "  train_loss               =       7.34\n",
            "  train_runtime            = 0:02:53.39\n",
            "  train_samples            =       6906\n",
            "  train_samples_per_second =      0.577\n",
            "  train_steps_per_second   =      0.577\n"
          ]
        }
      ],
      "source": [
        "!cd Chinese-LLaMA-Alpaca/scripts && torchrun --nnodes 1 --nproc_per_node 1 run_clm_pt_with_peft.py \\\n",
        "    --deepspeed ds_zero2_no_offload.json \\\n",
        "    --model_name_or_path decapoda-research/llama-7b-hf \\\n",
        "    --tokenizer_name_or_path ziqingyang/chinese-llama-lora-7b \\\n",
        "    --dataset_dir /content/Chinese-LLaMA-Alpaca/pt_data \\\n",
        "    --data_cache_dir data_cache \\\n",
        "    --validation_split_percentage 0.001 \\\n",
        "    --per_device_train_batch_size 1 \\\n",
        "    --do_train \\\n",
        "    --fp16 \\\n",
        "    --seed $RANDOM \\\n",
        "    --max_steps 100 \\\n",
        "    --lr_scheduler_type cosine \\\n",
        "    --learning_rate 2e-4 \\\n",
        "    --warmup_ratio 0.05 \\\n",
        "    --weight_decay 0.01 \\\n",
        "    --logging_strategy steps \\\n",
        "    --logging_steps 10 \\\n",
        "    --save_strategy steps \\\n",
        "    --save_total_limit 3 \\\n",
        "    --save_steps 50 \\\n",
        "    --gradient_accumulation_steps 1 \\\n",
        "    --preprocessing_num_workers 8 \\\n",
        "    --block_size 512 \\\n",
        "    --output_dir /content/output_model \\\n",
        "    --overwrite_output_dir \\\n",
        "    --ddp_timeout 30000 \\\n",
        "    --logging_first_step True \\\n",
        "    --lora_rank 8 \\\n",
        "    --lora_alpha 32\\\n",
        "    --trainable q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj \\\n",
        "    --modules_to_save embed_tokens,lm_head \\\n",
        "    --lora_dropout 0.05 \\\n",
        "    --torch_dtype float16 \\\n",
        "    --gradient_checkpointing \\\n",
        "    --ddp_find_unused_parameters False"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "hvVWDy9YPzG1"
      },
      "source": [
        "After training, rename saved `pytorch_model.bin` to `adapter_model.bin`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "KnA4qnBCX3ev"
      },
      "outputs": [],
      "source": [
        "!mkdir output_model/peft_model\n",
        "!mv output_model/pytorch_model.bin output_model/peft_model/adapter_model.bin"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "hDRJlD8sYs7E"
      },
      "source": [
        "Lastly, you need to manually create an `adapter_config.json` under `peft_model` and fill in the hyperparamters such as `lora_rank`, `lora_alpha` etc., whose content and \n",
        "format can be referenced from the corresponding file in Chinese-LLaMA-LoRA."
      ]
    },
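    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As an illustration only, the cell below sketches such an `adapter_config.json`, with values mirroring the flags passed to `run_clm_pt_with_peft.py` above; verify the exact field names and values against the `adapter_config.json` shipped with `ziqingyang/chinese-llama-lora-7b` before relying on it. Once the file is in place, the adapter can be loaded with `peft.PeftModel.from_pretrained`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "\n",
        "# Sketch of a PEFT LoRA adapter config. Values mirror the training flags\n",
        "# (--lora_rank 8, --lora_alpha 32, --lora_dropout 0.05, --trainable ...,\n",
        "# --modules_to_save ...); field names follow the peft LoraConfig schema.\n",
        "# Double-check both against the file in Chinese-LLaMA-LoRA.\n",
        "adapter_config = {\n",
        "    \"base_model_name_or_path\": \"decapoda-research/llama-7b-hf\",\n",
        "    \"bias\": \"none\",\n",
        "    \"fan_in_fan_out\": False,\n",
        "    \"inference_mode\": True,\n",
        "    \"lora_alpha\": 32.0,\n",
        "    \"lora_dropout\": 0.05,\n",
        "    \"modules_to_save\": [\"embed_tokens\", \"lm_head\"],\n",
        "    \"peft_type\": \"LORA\",\n",
        "    \"r\": 8,\n",
        "    \"target_modules\": [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\",\n",
        "                       \"gate_proj\", \"down_proj\", \"up_proj\"],\n",
        "    \"task_type\": \"CAUSAL_LM\",\n",
        "}\n",
        "\n",
        "with open(\"output_model/peft_model/adapter_config.json\", \"w\") as f:\n",
        "    json.dump(adapter_config, f, indent=2)"
      ]
    }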
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "A100",
      "machine_shape": "hm",
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
